/* NoX (NoC Simulator)
 *
 * Dept. of Computer Science & Engineering, Pennsylvania State University.
 * All Rights Reserved.
 *  
 * 1. License     
 * NoX is distributed free of charge for academic, educational, noncommercial 
 * research purposes as long as this notice in its entirety is preserved in
 * every file included in this package.
 * All commercial use of this program requires separate licence. Contact the
 * author for details.
 * 
 * 2. All the publications that used the simulation results generated by the 
 * NoX should notify the author of the publication information and put 
 * following reference.
 *
 *  http://www.cse.psu.edu/~dpark/nox/
 * 
 * 3. Modification of the source code is permitted and encouraged as long as 
 * it follows the terms described in this copyright notice.
 *
 * 4. The author is not responsible for any problems caused by possible errors
 * of the NoX package. Therefore, users should verify the simulation result
 * before using it in their publication.
 *
 * Dept. of Computer Science & Engineering, Pennsylvania State University.
 * Contact: dpark@cse.psu.edu 
 * 
 * 6. If problems are found with the NoX package, please send an email to the
 * author for discussion and correction.

*/

/* Update History
 *
 *
 */

/* ROUTER.C - Router pipelines */


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cassert>
#include "main.h"
#include "nic.h"
#include "link.h"
#include "router.h"
#include "router_common.h"
#include "route_adaptive.h"
#include "route_proximity_aware.h"
#include "route_simple_ft.h"
#include "shared.h"

#ifdef TR_INTEG
#undef INVALID
#include "defines.h"
#include "transaction.h"
#include "globals.h"
#undef INVALID
#define INVALID 1
#endif


// Forward declarations of array helper routines whose definitions are not part of
// this section of the file.
// NOTE(review): judging by the names and signatures only, these initialise / print
// 2-D, 3-D and 4-D int arrays passed as a flat `int *`; the exact meaning of the
// dimN versus dN parameters cannot be determined from here -- confirm against the
// definitions before relying on it.
void init_array2(int dim1, int dim2, int d1, int d2, int *arr);
void init_array3(int dim1, int dim2, int dim3, int d1, int d2, int d3, int *arr);
void init_array4(int dim1, int dim2, int dim3, int dim4, int d1, int d2, int d3, int d4, int *arr);
void print_array3(int dim1, int dim2, int dim3, int *arr);



void stage1()
{
  // This stage handles decoding and making routing decision.
  int node, pc, vc;
  int dest_node, dest_pc, dest_vc;
  char buf[5];
  int chk_error=0;
  double delay;
  static int last_used_vc[MAX_NODES][MAX_PC-MAX_NIC_PC] = {0};

  flit_t *flit_ptr;

  for(node=0; node<NUM_NODES; node++)
  {
    if(router[node].health_status == FAIL || router[node].health_status == TRAPPED)  
      continue;

    NUM_PC     = router_info[node].num_pc;
    NUM_NIC_PC = router_info[node].num_nic_pc;
    NUM_VC     = router_info[node].num_vc;

    for(pc=0; pc<NUM_PC; pc++)
    {
      for(vc=0; vc<NUM_VC; vc++)
      {
        router_occupancy[node] += msg_cnt(&(router_input_buf[node][pc][vc]));
        if( vc_info[node][pc][vc].vc_stat == VC_IDLE &&
            msg_cnt(&(router_input_buf[node][pc][vc])) > 0 )
        {
          // Receive flit data.
          if(verbose == YES)
          {
            printf("[%d][%d][%d] rinbuf :%d vc_stat:%s for current node\n",node,pc,vc,msg_cnt(&(router_input_buf[node][pc][vc])),
                vc_state[vc_info[node][pc][vc].vc_stat]);
            print_mbox(&(router_input_buf[node][pc][vc]));
          }

          read_flit(&(router_input_buf[node][pc][vc]), &flit_ptr);

          if(sql == YES)
            fprintf(fsql, "INSERT INTO FLOW VALUES(0, %d,%d,%d,%d,%d,%d);\n", 
                flit_ptr->flit_num, node, pc, vc, 1, sim_clock);

          if(verbose == YES) 
          { 
            printf("Dec/RT  [%d][%d][%d]-flit:%d(%s) at %lld\n", node, pc, vc, flit_ptr->flit_num, 
                (HEAD_FLIT)?"HEAD":(TAIL_FLIT)?"TAIL":"MIDDLE", (long long)sim_clock); 
            fflush(stdout);
          }

          // Do the routing (HEAD flit only).
          if(HEAD_FLIT)
          {
            /*if(flit_ptr->flit_num == 511)//574 && sim_clock > 1000)
              {
              printf("break here!\n");
              }*/

            //SPARE buffer logic for priority inversion
#ifdef TR_INTEG            
            if(flit_ptr->priority_id != -1)
              processor[flit_ptr->priority_id].priority_inversion_latency += flit_ptr->priority_inversion_cycles;
            flit_ptr->priority_inversion_cycles = 0;
#endif
            // Decide crossbar output PC with routing algorithm.
            dest_node = flit_ptr->data.dnode;
            //dest_pc = router->rtable[ph_chan][vr_chan].port_table[dest_node].op1_pc;

            // Add power consumed by routing logic.
            p_routing += PD_ROUTER;
            switch(routing_algorithm)
            {
              // By default, all the routing algorithms return both PC and VC.
              // If you want to use separate VA(Virtual Channel Allocator), you can simply
              // ignore the VC value returned by the routing functions and overwrite it with
              // the VC number assigned by the VA module, which should be implemented separately.
              case DT : {
                          int min_range, max_range, tmp_max, i, flag, next_node, next_pc;
                          // Deterministic routing... (X-Y routing) 
                          deterministic_route(node, dest_node, pc, vc, &dest_pc, &dest_vc, flit_ptr);
                          //VC allocation
                          dest_vc = get_best_vc(node, pc, vc, dest_pc, flit_ptr);
                          break;
                        }
              case WF : {
                          // West-First routing... 
                          west_first_route(node, dest_node, pc, vc, &dest_pc, &dest_vc);
                          break;
                        }

              case AD : {
                          // Fully Adaptive routing...

                          // General deadlock free routing (Duato)
                          // If this flit has waited more than ADAP_THRES cycles, switch to the 
                          // deterministic VC(s).
                          // This can be done by simply setting current VC to 0 (or 1 in TORUS) 
                          // and then call adaptive_route() function.
                          if( (sim_clock - flit_ptr->entry_time) % ADAP_THRES == 0)
                          {
                            if(topology == MESH)
                              adaptive_route(node, dest_node, pc, 0, &dest_pc, &dest_vc);

                            else if(topology == TORUS)
                            {
                              int sx, sy, dx, dy, tmp_vc;
                              calc_coord(node, &sx, &sy);
                              calc_coord(flit_ptr->data.dnode, &dx, &dy);
                              tmp_vc = (sx == dx)? ((flit_ptr->is_sy_less_than_dy == YES)? 1:0) :
                                ((flit_ptr->is_sx_less_than_dx == YES)? 1:0);
                              adaptive_route(node, dest_node, pc, tmp_vc, &dest_pc, &dest_vc);
                            }
                          }

                          else // Non starving flits...
                          {
                            // In MESH topology, we don't need to worry about VC number as in TORUS before.
                            // adaptive_route() function will automatically take care of this.
                            if(topology == MESH)
                              adaptive_route(node, dest_node, pc, vc, &dest_pc, &dest_vc);
                            else if(topology == TORUS)
                            {
                              if(vc >= 2) // Using Adaptive Routing until now.
                                adaptive_route(node, dest_node, pc, vc, &dest_pc, &dest_vc);
                              else // (vc == 0 or 1, in other words, Using deterministic escape path.)
                              { // Since we need two VCs for deadlock freedom, need to check 
                                // which of them to use.
                                int cx, cy, dx, dy, tmp_vc;
                                calc_coord(node, &cx, &cy);
                                calc_coord(flit_ptr->data.dnode, &dx, &dy);
                                tmp_vc = (cx == dx)? ((flit_ptr->is_sy_less_than_dy == YES)? 1:0) : vc;
                                adaptive_route(node, dest_node, pc, tmp_vc, &dest_pc, &dest_vc);
                              }
                            }// else
                          }
                          break;			      
                        }
              case PA : {
                          // Not Complete!! - Do not use this for performance evaluation.
                          // Proximity Aware Fault Tolerant routing...
                          if( (sim_clock - flit_ptr->entry_time) % ADAP_THRES == 0)
                            proximity_aware_route(node, dest_node, pc, 0, &dest_pc, &dest_vc);
                          else // Non starving flits...
                            proximity_aware_route(node, dest_node, pc, vc, &dest_pc, &dest_vc);
                          break;
                        }
              case SF : {
                          // Not Complete!! - Do not use this for performance evaluation.
                          // Simple Fault Tolerant routing...
                          if( (sim_clock - flit_ptr->entry_time) % ADAP_THRES == 0)
                            simple_ft_route(node, dest_node, pc, 0, &dest_pc, &dest_vc);
                          else // Non starving flits...
                            simple_ft_route(node, dest_node, pc, vc, &dest_pc, &dest_vc);
                          break;
                        }
            }// switch

            // Update router information

            // If arch <= 2, we perform look-ahead routing, where a routing decision for current node
            // is made in the previous node.
            // 
            // To correctly implement the look-ahead routing, each router need to perform routing in advance
            // and embed that information into the head flit and send it to the next node.
            // 
            // To simply implement this, we just perform routing at the current node (instead of at a previous node)
            // and treat it as if it is sent from the previous node.  
            // To do this, simply mark vc_stat to VC_ROUTING instead of VC_ROUTING_TMP.
            // 
            // Although this is not a correct way of implementing a look-ahead routing (since credit information
            // might be slightly different at the time of routing in both cases), it seems to be tolerable (see below). 
            // 
            // Consider following cases:
            // 1) DT with R->P:
            //    Routing decision of both correct and simplified implementation are excactly the same with 
            //    each other since routing algorithm does not consider credit information.
            // 2) DT with R->PV:
            //    Routing will consider credit, but it's just for determining best VC. (PC does not change 
            //    in both implementation types). Thus, it's very likely that both routing decisions are the same.
            // 3) Any adaptive routing with R->P:
            //    Routing will consider credit for PC. However, both routing decisions are expected to be similar 
            //    unless the credit of candidate out-PCs were similar but suddenly change, which seem very unlikely.
            // 4) Any adaptive routing with R->PV:
            //    Routing will consider credit for both PC and VC. This is the worst case. However, considering 
            //    cases 2) and 3) above, the routing decision of both implementation are still expected to be similar 
            //    although it's worse than cases 2) and 3).
            // The assumption of cases 2)~4) is that credit information does not change that frequently, which is
            // reasonable in both 2-stage and 1-stage pipelines (note that lock-ahead routing is required only when 
            // arch <=2).
            //
            vc_info[node][pc][vc].vc_stat = (arch <= 2)? VC_ROUTING : VC_ROUTING_TMP;


            // Bypass VA for single flit packets doesnot help at all because of
            // lool ahead routing
            //if(flit_ptr->flit_type == CONTROL && arch <=2)
            //vc_info[node][pc][vc].vc_stat = VC_VA_DONE;

            vc_info[node][pc][vc].out_pc=dest_pc; // Modify routing algorithms to return multiple PCs
            vc_info[node][pc][vc].out_vc=dest_vc; // Modify routing algorithms to return multiple VCs

            if(vc_info[node][pc][vc].priority_id == flit_ptr->priority_id)
              flit_ptr->interference_cycles += sim_clock - flit_ptr->entry_time;
            vc_info[node][pc][vc].priority_id = flit_ptr->priority_id; // Modify routing algorithms to return multiple VCs
            
            if(verbose == YES) 
            {
              if(dest_pc < NUM_PC-NUM_NIC_PC) 
              {
                printf("Dec/RT Done for [%d][%d][%d]-flit:%d(%s) to dest node:%d dest pc:%d dest vc:%d at %lld\n", node, pc, vc, flit_ptr->flit_num, 
                    (HEAD_FLIT)?"HEAD":(TAIL_FLIT)?"TAIL":"MIDDLE", neighbor[node][dest_pc], dest_pc,dest_vc,(long long)sim_clock); 
                printf("[%d][%d][%d] \n",neighbor[node][dest_pc],neighbor_pc[node][dest_pc],dest_vc);
                print_mbox(&(router_input_buf[neighbor[node][dest_pc]][neighbor_pc[node][dest_pc]][dest_vc]));
              }
              else
                printf("Dec/RT Done for [%d][%d][%d]-flit:%d(%s) to dest node:%d dest pc:%d dest vc:%d at %lld\n", node, pc, vc, flit_ptr->flit_num, 
                    (HEAD_FLIT)?"HEAD":(TAIL_FLIT)?"TAIL":"MIDDLE", node, dest_pc,dest_vc,(long long)sim_clock); 

              fflush(stdout);
            }
            if(dest_pc < 0)
            {printf("Routing function malfunction!\n");exit(1);}

          }// if HEAD_FLIT
        }// if( msg_cnt(&(router_input_buf[node][pc][vc])) > 0 )
      }// for vc
    }// for pc
  }// for node
}


// Virtual-channel-allocator (VA) working tables, indexed [node][out_pc][out_vc]...
// VA_stage() clears need_stage1_arb, need_stage2_arb, VA_stage1_req, VA_stage2_req
// and VA_stage2_req_table each time it runs.
//
// need_stage2_arb is set to 1 by VA_stage() for every output VC that received at
// least one request; need_stage1_arb is only cleared in VA_stage().
int need_stage1_arb[MAX_NODES][MAX_PC][MAX_VC]={0}; 
int need_stage2_arb[MAX_NODES][MAX_PC][MAX_VC]={0}; 
// Request flags. In VA_stage() the last index of VA_stage2_req is written as
// in_pc*NUM_VC+in_vc of the requesting input VC.
// NOTE(review): VA_stage2_req_table is cleared and read by VA_stage(), but nothing
// in VA_stage() sets it -- presumably written elsewhere; confirm.
int VA_stage1_req[MAX_NODES][MAX_PC][MAX_VC][MAX_VC]={0}; // Used only when R->P
int VA_stage2_req[MAX_NODES][MAX_PC][MAX_VC][MAX_PC*MAX_VC]={0}; // Used always
int VA_stage2_req_table[MAX_NODES][MAX_PC][MAX_VC][MAX_PC*MAX_VC]={0}; // Used always

// Per-request priority and age values. VA_stage() fills the stage2 tables only
// when the arbiter type is PR and passes them to PR_arbiter().
long long VA_stage1_pri[MAX_NODES][MAX_PC][MAX_VC][MAX_VC]={0}; // Used only when R->P
long long VA_stage2_pri[MAX_NODES][MAX_PC][MAX_VC][MAX_PC*MAX_VC]={0}; // Used always
long long VA_stage1_age[MAX_NODES][MAX_PC][MAX_VC][MAX_VC]={0}; // Used only when R->P
long long VA_stage2_age[MAX_NODES][MAX_PC][MAX_VC][MAX_PC*MAX_VC]={0}; // Used always

/*
 * VA_stage - Virtual Channel Allocator stage of the router pipeline.
 *
 * Works entirely on global simulator state; takes no parameters and returns
 * nothing.  One call performs, in order:
 *   0. Reset of VA_win_id, need_stage{1,2}_arb and the request tables for every
 *      output (PC, VC) whose neighbor is above EDGE.
 *   1. Request generation: every input VC in state VC_ROUTING gets an output VC
 *      from get_best_vc() and raises a request in VA_stage2_req; for the PR
 *      arbiter the matching priority/age entries are filled in as well.
 *   2. Arbitration (RR / LRU / PR) per requested output VC, plus arbiter power
 *      accounting in p_vc_arb.
 *   3. State update: every input VC whose request flag is still 1 and that is
 *      still VC_ROUTING moves to VC_VA_DONE_TMP, and VA_win_id records the
 *      flit's priority_id.
 *   4. Stall-time accounting driven by VA_stage2_req_table.
 */
void VA_stage() // Virtual Channel Allocator
{
  // 1. R->P (requires two-stage arbitration, V:1 -> PV:1)
  // NOT IMPLEMENTED
  // Stage 1 - V:1 arbitration per each input VC
  // Among candidate output VCs for a flit returned by routing algorithm, select only
  // one output VC request.
  //
  // Stage 2 - PV:1 arbitration per each output VC
  // Among the winners of the stage 1 (one from each input VC), select only one flit
  // per each output VC.
  //
  // stage1_req -> stage1_win -> Generate stage2_req -> stage2_win
  //
  //
  // 2. R->V (requires single-stage PV:1 arbitration. Use stage2_req/stage2_win table).
  // IMPLEMENTED
  // (We can think that stage 1 has already been done by the routing algorithm since it
  // returns only one candidate VC per each flit.)
  // Among the flits in input VCs, select only one flit per each output VC.
  // stage2_req -> stage2_win

  // Begin Virtual Channel Arbitration.
  int node, in_pc, in_vc, out_pc, out_vc;
  int next_node, next_pc, i;
  // Grant table handed to the arbiters; static so its contents persist across calls.
  static int VA_stage2_win[MAX_NODES][MAX_PC][MAX_VC][MAX_PC*MAX_VC*MAX_PRI_LEVELS]={0}; // Used always

  flit_t *flit_ptr;

  if(verbose == YES)
    printf("Begin VA Stage**********\n");
  int RT_type = R_V;
  int VA_type2 = ARB_TYPE;

  // ----------------------------------------------
  // 0. Clear per-cycle request state
  // ----------------------------------------------
  for(node=0; node<NUM_NODES; node++)
  {
    NUM_PC     = router_info[node].num_pc;
    NUM_NIC_PC = router_info[node].num_nic_pc;
    NUM_VC     = router_info[node].num_vc;
    for(out_pc=0; out_pc<NUM_PC; out_pc++)
    {
      int requesting_node = neighbor[node][out_pc];
      if(requesting_node <= EDGE)
        continue;
      int NUM_IN_PC       = router_info[requesting_node].num_pc;
      int NUM_IN_VC       = router_info[requesting_node].num_vc;
      for(out_vc=0; out_vc<NUM_VC; out_vc++)
      {
        //=======================================================================
        //Stall Time Fairness Support
        //=======================================================================
        VA_win_id[node][out_pc][out_vc] = -1;
        need_stage1_arb[node][out_pc][out_vc] = 0;
        need_stage2_arb[node][out_pc][out_vc] = 0;
        for(in_vc=0; in_vc<NUM_IN_VC*NUM_IN_PC; in_vc++)
        {
          VA_stage1_req[node][out_pc][out_vc][in_vc] = 0;
          VA_stage2_req[node][out_pc][out_vc][in_vc] = 0;
          VA_stage2_req_table[node][out_pc][out_vc][in_vc] = 0;
        }// all in vc's
      }// out vc
    }//out pc
  }//node

  if(RT_type == R_V)
  { // VA requires single (PV:1) arbitration stage per VC. (Use stage2 variables only)
    // ----------------------------------------------
    // 1. Initialize arbitration request table
    // ----------------------------------------------
    for(node=0; node<NUM_NODES; node++)
    {
      NUM_PC     = router_info[node].num_pc;
      NUM_NIC_PC = router_info[node].num_nic_pc;
      NUM_VC     = router_info[node].num_vc;

      for(in_pc=0; in_pc<NUM_PC; in_pc++)
        for(in_vc=0; in_vc<NUM_VC; in_vc++)
          if(vc_info[node][in_pc][in_vc].vc_stat == VC_ROUTING)
          {
            read_flit(&(router_input_buf[node][in_pc][in_vc]), &flit_ptr);

            out_pc = vc_info[node][in_pc][in_vc].out_pc;
            /************************************************/
            //Do VC Allocation every cycle
            /************************************************/
            vc_info[node][in_pc][in_vc].out_vc = get_best_vc(node, in_pc, in_vc, out_pc, flit_ptr);
            out_vc = vc_info[node][in_pc][in_vc].out_vc;

            // Flat index of the requesting input VC inside the request/priority rows.
            // NOTE(review): the index is built with this node's NUM_VC, while the
            // arbitration and update passes below size/decode it with the
            // neighbor's num_pc/num_vc; the two only agree when both routers have
            // the same geometry -- confirm for heterogeneous configurations.
            int req_idx = in_pc*NUM_VC+in_vc;

            VA_stage2_req[node][out_pc][out_vc][req_idx] = 1;
            if(VA_type2 == PR)
            {
              if(slack_based_arbitration)
              {
                float age = sim_clock - flit_ptr->msg_inj_tm;
                float slack = flit_ptr->slack;
                //Please note some offsetting has to be done because the pri
                //array is long long and cannot contain fractions
                // NOTE(review): slack == 0 makes age/slack infinite/NaN, and the
                // conversion to long long is then undefined -- confirm slack is
                // never 0 here.
                VA_stage2_pri[node][out_pc][out_vc][req_idx] =  (long long)(age > slack ? age/slack : -10000*(age/slack));
              }
              else if(age_based_arbitration)
                VA_stage2_pri[node][out_pc][out_vc][req_idx] = sim_clock - flit_ptr->msg_inj_tm;
              else
                VA_stage2_pri[node][out_pc][out_vc][req_idx] =
                  flit_ptr->priority;

              // The age entry is filled for every PR request, independent of the
              // priority flavour chosen above.
              VA_stage2_age[node][out_pc][out_vc][req_idx] = sim_clock - flit_ptr->msg_inj_tm;
              if(enable_app_age)
                VA_stage2_age[node][out_pc][out_vc][req_idx] = sim_clock - flit_ptr->app_inj_tm;
            }
            if(verbose == YES)
            {
              float age = sim_clock - flit_ptr->msg_inj_tm;
              // Explicit casts keep every argument in step with its conversion
              // specifier regardless of the underlying typedefs.
              printf("request table VA 2  [%d][%d][%d] to pc,out_vc [%d][%d] flit:%d pri id:%d pri:%lld batch id:%lld last batch id:%lld @clock %lld slack:%.2f age:%.2f\n",
                  node,in_pc,in_vc, out_pc,out_vc,flit_ptr->flit_num,
                  flit_ptr->priority,
                  (long long)VA_stage2_pri[node][out_pc][out_vc][req_idx],
                  (long long)flit_ptr->batch_id, (long long)last_batch_id[node],(long long)sim_clock,(double)flit_ptr->slack,(double)age);
              //print_vc_state(node);
            }

            need_stage2_arb[node][out_pc][out_vc] = 1;
          }// if
    }

    // ----------------------------------------------
    // 2. Now, do the arbitration.
    // ----------------------------------------------
    for(node=0; node<NUM_NODES; node++)
    {

      NUM_PC     = router_info[node].num_pc;
      NUM_NIC_PC = router_info[node].num_nic_pc;
      NUM_VC     = router_info[node].num_vc;


      for(out_pc=0; out_pc<NUM_PC; out_pc++)
      {
        int requesting_node = neighbor[node][out_pc];
        if(requesting_node <= EDGE)
          continue;
        int NUM_IN_PC       = router_info[requesting_node].num_pc;
        int NUM_IN_VC       = router_info[requesting_node].num_vc;

        for(out_vc=0; out_vc<NUM_VC; out_vc++)
          if(need_stage2_arb[node][out_pc][out_vc] == 1)
          { // VA requires single PV:1 arbitration stage per output VC. (Use stage2 variables only)

            //if(verbose == YES)
            //printf("VA stage 2 arbitration for [%d][%d][%d]\n",node,out_pc,out_vc);
            if(VA_type2 == RR) // Round-Robin Arbiter
              RR_arbiter (VA_stage2_req[node][out_pc][out_vc],
                  VA_stage2_win[node][out_pc][out_vc], NUM_IN_PC, NUM_IN_VC);
            else if(VA_type2 == LRU) // Matrix (LRU) Arbiter
              LRU_arbiter(VA_stage2_req[node][out_pc][out_vc],
                  VA_stage2_win[node][out_pc][out_vc], NUM_IN_PC, NUM_IN_VC);
            else if(VA_type2 == PR) // Priority RR Arbiter
              PR_arbiter(VA_stage2_req[node][out_pc][out_vc],
                  VA_stage2_win[node][out_pc][out_vc],
                  VA_stage2_pri[node][out_pc][out_vc],
                  VA_stage2_age[node][out_pc][out_vc], NUM_IN_PC, NUM_IN_VC);

            /*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*/
            // Power
            p_vc_arb += router_info[requesting_node].pd_pv_1_arb + router_info[requesting_node].pl_pv_1_arb;
            /*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*/
          }// if
      }
    }
  }// if(RT_type == R_V)


  // -------------------------------------------------
  // 3. Now, arbitration done. update router information
  // -------------------------------------------------
  for(node=0; node<NUM_NODES; node++)
  {
    NUM_PC     = router_info[node].num_pc;
    NUM_VC     = router_info[node].num_vc;
    for(out_pc=0; out_pc<NUM_PC; out_pc++)
    {
      int requesting_node = neighbor[node][out_pc];
      if(requesting_node <= EDGE)
        continue;
      int NUM_IN_PC       = router_info[requesting_node].num_pc;
      int NUM_IN_VC       = router_info[requesting_node].num_vc;
      for(out_vc=0; out_vc<NUM_VC; out_vc++)
        if(need_stage2_arb[node][out_pc][out_vc] == 1)
        {
          for(i=0; i<NUM_IN_PC*NUM_IN_VC; i++)
          {
            // NOTE(review): the request flag (not VA_stage2_win) is what marks a
            // winner here -- presumably the arbiters clear losing requests in
            // place; confirm against the arbiter implementations.
            if(VA_stage2_req[node][out_pc][out_vc][i] == 1)
            {
              in_pc = i / NUM_IN_VC;
              in_vc = i % NUM_IN_VC;
              if(vc_info[node][in_pc][in_vc].vc_stat != VC_ROUTING)
                continue;// Only when head flits
              vc_info[node][in_pc][in_vc].vc_stat = VC_VA_DONE_TMP;
              vc_info[node][in_pc][in_vc].out_pc  = out_pc;
              vc_info[node][in_pc][in_vc].out_vc  = out_vc;
              read_flit(&(router_input_buf[node][in_pc][in_vc]), &flit_ptr);
              //=======================================================================
              //Stall Time Fairness Support
              //=======================================================================
              VA_win_id[node][out_pc][out_vc] = flit_ptr->priority_id;

              if(verbose == YES)
              {
                // batch_id is printed with %lld in the request trace above; use
                // the same width here so the argument matches its specifier.
                printf("VA      [%d][%d][%d]-flit:%d(%s) to [%d][%d][%d] [rank/batch][%d/%lld]\n", node,
                    in_pc, in_vc, flit_ptr->flit_num,
                    (HEAD_FLIT)? "HEAD" :
                    (TAIL_FLIT)? "TAIL" :"MIDDLE",
                    node, out_pc, out_vc,
                    flit_ptr->priority, (long long)flit_ptr->batch_id);

                next_node = neighbor[node][out_pc];
                next_pc = neighbor_pc[node][out_pc];
                printf("rinbuf [%d][%d][%d] :%d\n",next_node,next_pc,out_vc,is_ready[next_node][next_pc][out_vc].rinbuf);
                print_mbox(&(router_input_buf[next_node][next_pc][out_vc]));
                fflush(stdout);
              }
            }// if a winner
          } //all input vc's
        }// all output vc's
    }//all output pc's
  }//all nodes

  // -------------------------------------------------
  // 4. Stall-time accounting
  // -------------------------------------------------
  // For every arbitrated output VC, each entry flagged in VA_stage2_req_table
  // whose flit carries the same priority_id as the recorded winner gets its
  // interference_cycles and sa_cycles incremented.
  // NOTE(review): VA_stage2_req_table is only cleared (never set) inside this
  // function, so this pass acts only if the table is written elsewhere.
  for(node=0; node<NUM_NODES; node++)
  {
    NUM_PC     = router_info[node].num_pc;
    NUM_VC     = router_info[node].num_vc;
    for(out_pc=0; out_pc<NUM_PC; out_pc++)
      for(out_vc=0; out_vc<NUM_VC; out_vc++)
        if(need_stage2_arb[node][out_pc][out_vc] == 1)
        {
          int requesting_node = neighbor[node][out_pc];
          if(requesting_node <= EDGE)
            continue;
          int NUM_IN_PC       = router_info[requesting_node].num_pc;
          int NUM_IN_VC       = router_info[requesting_node].num_vc;
          for(i=0; i<NUM_IN_PC*NUM_IN_VC; i++)
            if(VA_stage2_req_table[node][out_pc][out_vc][i] == 1)
            {
              in_pc = i / NUM_IN_VC;
              in_vc = i % NUM_IN_VC;
              read_flit(&(router_input_buf[node][in_pc][in_vc]), &flit_ptr);
              if(flit_ptr->priority_id == VA_win_id[node][out_pc][out_vc])
              {
                flit_ptr->interference_cycles++;
                flit_ptr->sa_cycles++;
              }
            }
        }
  }

}

// Switch-allocator (SA) working tables. They are defined directly ahead of
// SA_stage(), whose body is only partly visible in this section, so the notes
// below are based on the names and array shapes.
// NOTE(review): presumably stage1 tables are per input PC over its VCs and stage2
// tables are per output PC over input PCs, matching the V:1 / P:1 description in
// SA_stage() -- confirm against the full function.

// Non-speculative request flags.
int SA_stage1_req[MAX_NODES][MAX_PC][MAX_VC]={0};
int SA_stage2_req[MAX_NODES][MAX_PC][MAX_PC]={0}; 
int SA_stage2_bus_req[MAX_NODES][MAX_PC*MAX_PC]={0}; 
int SA_stage1_req_table[MAX_NODES][MAX_PC][MAX_VC]={0}; 
int SA_stage2_req_table[MAX_NODES][MAX_PC][MAX_PC]={0}; 

// "Arbitration needed" flags for the non-speculative path.
int need_stage1_SA_arb[MAX_NODES][MAX_PC]={0}; 
int need_stage2_SA_arb[MAX_NODES][MAX_PC]={0}; 
int need_stage2_bus_arb[MAX_NODES]={0}; 

// Speculative request flags.
int SA_stage1_spec_req[MAX_NODES][MAX_PC][MAX_VC]={0};
int SA_stage2_spec_req[MAX_NODES][MAX_PC][MAX_PC]={0}; 
int SA_stage2_bus_spec_req[MAX_NODES][MAX_PC*MAX_PC]={0}; 

// "Arbitration needed" flags for the speculative path.
int need_stage1_spec_arb[MAX_NODES][MAX_PC]={0}; 
int need_stage2_spec_arb[MAX_NODES][MAX_PC]={0}; 
int need_stage2_bus_spec_arb[MAX_NODES]={0}; 

// Priority / age values for non-speculative requests.
long long SA_stage1_pri[MAX_NODES][MAX_PC][MAX_VC]={0};
long long SA_stage2_pri[MAX_NODES][MAX_PC][MAX_PC]={0}; 
long long SA_stage1_age[MAX_NODES][MAX_PC][MAX_VC]={0};
long long SA_stage2_age[MAX_NODES][MAX_PC][MAX_PC]={0}; 

// Priority / age values for speculative requests.
long long SA_stage1_spec_pri[MAX_NODES][MAX_PC][MAX_VC]={0};
long long SA_stage2_spec_pri[MAX_NODES][MAX_PC][MAX_PC]={0}; 
long long SA_stage1_spec_age[MAX_NODES][MAX_PC][MAX_VC]={0};
long long SA_stage2_spec_age[MAX_NODES][MAX_PC][MAX_PC]={0}; 

// Request / grant tables for the "XShare" feature mentioned in SA_stage().
int SA_xshare_req[MAX_NODES][MAX_PC][MAX_PC*MAX_VC]={0};
int SA_xshare_win[MAX_NODES][MAX_PC][MAX_PC*MAX_VC]={0};

void SA_stage() // Switch Allocator
{
  // A separable allocator is implemented.
  // Stage 1 is the input port arbitration and stage2 is the output port arbitration.
  //
  // Stage 1 - V:1 arbitration per each input PC
  // Among flits in different VCs in the same PC, select only one.
  // 
  // Stage 2 - P:1 arbitration per each output PC
  // Among the winners of the stage 1 (one from each PC), select only one flit per each 
  // output PC.

  // Begin Switch Arbitration.
  int node, in_pc, in_vc, out_pc, out_vc, next_node, next_pc, i, j;
  flit_t *flit_ptr;

  // Generate Request & Grant Table.
  // 1. Non-Speculative 
  static int SA_stage1_win[MAX_NODES][MAX_PC][MAX_VC*MAX_PRI_LEVELS]={0}; 
  static int SA_stage2_win[MAX_NODES][MAX_PC][MAX_PC*MAX_PRI_LEVELS]={0}; 
  static int SA_stage2_bus_win[MAX_NODES][MAX_PC*MAX_PC]={0}; 

  // 2. Speculative
  static int SA_stage1_spec_win[MAX_NODES][MAX_PC][MAX_VC*MAX_PRI_LEVELS]={0}; 
  static int SA_stage2_spec_win[MAX_NODES][MAX_PC][MAX_PC*MAX_PRI_LEVELS]={0}; 
  static int SA_stage2_bus_spec_win[MAX_NODES][MAX_PC*MAX_PC]={0}; 


  int request_type, vc_stat, msg_exist;
  int is_in_vc_used [MAX_PC], is_out_vc_used[MAX_PC];


  int SA_type1 = ARB_TYPE;
  int SA_type2 = ARB_TYPE;
  int path_reserved;

  if(verbose == YES)
    printf("Begin SA Stage**********\n");
  //Stat for flit combination potential
  //int flits_with_same_out_pc[MAX_NODES][MAX_PC] = {0};

  // SA requires two arbitration stages. 
  // (V:1 arbitration per input PC ->  P:1 arbitration per output PC.) 
  // ----------------------------------------------
  // 1. Initialize arbitration request table
  // ----------------------------------------------
  for(node=0; node<NUM_NODES; node++)
  {
    NUM_PC     = router_info[node].num_pc;
    NUM_NIC_PC = router_info[node].num_nic_pc;
    NUM_VC     = router_info[node].num_vc;
    //=======================================================================
    // XShare support 
    //=======================================================================
    for(out_pc=0; out_pc<NUM_PC; out_pc++)
      for(in_pc=0; in_pc<NUM_PC; in_pc++)
        for(in_vc=0; in_vc<NUM_VC; in_vc++)
          SA_xshare_req[node][out_pc][in_pc*NUM_VC + in_vc] = 0;
    //=======================================================================
    // XShare support end 
    //=======================================================================


    for(in_pc=0; in_pc<NUM_PC; in_pc++)
    {
      need_stage1_spec_arb[node][in_pc]      = 0;
      need_stage1_SA_arb[node][in_pc]        = 0;
      for(in_vc=0; in_vc<NUM_VC; in_vc++)
      {
        //Initialize request tables and arbitration tables
        SA_stage1_spec_req[node][in_pc][in_vc] = 0;
        SA_stage1_req[node][in_pc][in_vc]      = 0;
        SA_stage1_req_table[node][in_pc][in_vc] = 0;
        //=======================================================================
        //Stall Time Fairness Support
        //=======================================================================
        SA_win_id[node][in_pc] = -1;

        request_type = 0; //0: no requst, 1: speculative request, 2: non-speculative request

        vc_stat = vc_info[node][in_pc][in_vc].vc_stat;
        msg_exist = (msg_cnt(&(router_input_buf[node][in_pc][in_vc])) > 0)? YES:NO;
        out_pc = vc_info[node][in_pc][in_vc].out_pc; 
        out_vc = vc_info[node][in_pc][in_vc].out_vc; 

        if(arch <= 3) // Enable speculative-SA.
        { 
          // VCs with status 'VC_VA_DONE_TMP' and 'VC_ROUTING' need to be considered during the speculation.
          // Even if we know which requests win the VA in the simulator since VA stage is performed before SA stage,
          // in hardware, both VA and SA operate in parallel. 
          // So, actual speculation can result in non-optimal and therefore, we need to pretend that we don't know
          // the winner of VA stage. 
          // All we have to do is to generate requests in the VCs that are in 'VC_ROUTING' state at the beginning of 
          // this cycle and this includes both 'VC_VA_DONE_TMP (VA winners)' and 'VC_ROUTING (VA losers)'.
          if( ((vc_stat == VC_VA_DONE_TMP) || (vc_stat == VC_ROUTING) ) && msg_exist == YES)
            request_type = 1; // Speculative request
        }

        // For both Speculative and Non-speculative cases.
        if(vc_stat == VC_VA_DONE && msg_exist == YES)
          request_type = 2; // Non-speculative request


        if( request_type > 0)  
        { // Check all the VCs and set the requests.

          read_flit(&(router_input_buf[node][in_pc][in_vc]), &flit_ptr);
          out_pc = vc_info[node][in_pc][in_vc].out_pc; 
          out_vc = vc_info[node][in_pc][in_vc].out_vc; 

          //Stat for flit combination potential
          //if(flit_ptr->msgtype == CONTROL)
          //flits_with_same_out_pc[node][out_pc]++;


          //-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
          // Check whether this flit is starving.
          if( (out_pc < NUM_PC-NUM_NIC_PC) && 
              (routing_algorithm == AD) && 
              ((sim_clock - flit_ptr->entry_time) % ADAP_THRES == 0) && 
              (HEAD_FLIT))
          { // If so, use deterministic escape channel (Physical channel can also be changed).
            // For simplicity of the code, call adaptive_route() function.
            // Note that we set the VC value to 0 in the function call below to indicate that
            // this flit needs to follow the escape channel.
            int dest_node, dest_pc, dest_vc;

            dest_node = flit_ptr->data.dnode;
            if(topology == MESH)
              adaptive_route(node, dest_node, in_pc, 0, &dest_pc, &dest_vc);
            else if(topology == TORUS)
            {
              int sx, sy, dx, dy, tmp_vc;
              calc_coord(node, &sx, &sy);
              calc_coord(dest_node, &dx, &dy);
              tmp_vc = (sx == dx)? ((flit_ptr->is_sy_less_than_dy == YES)? 1:0) :
                ((flit_ptr->is_sx_less_than_dx == YES)? 1:0);
              adaptive_route(node, dest_node, in_pc, tmp_vc, &dest_pc, &dest_vc);
            }


            vc_info[node][in_pc][in_vc].vc_stat = VC_VA_DONE_TMP;
            vc_info[node][in_pc][in_vc].out_pc  = dest_pc; // Modify routing algorithms to return multiple PCs
            vc_info[node][in_pc][in_vc].out_vc  = dest_vc; // Modify routing algorithms to return multiple VCs
            num_escape_msgs++;

            continue; // We need to skip this flit at current cycle.
          }
          //-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-



          //-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
          // Check for invalid transfer.
          path_reserved = (
              is_ready[node][in_pc ][in_vc ].xbarin  == NO ||
              is_ready[node][out_pc][out_vc].xbarout == NO 
              )? YES : NO;

          if(HEAD_FLIT)
          {
            if( is_ready[node][in_pc ][in_vc ].xbarin == NO )
            {
              // Head flit is sent to a reserved input path. Should not happen.
              printf("Head to reserved path. Bad.[%d][%d][%d]: flit:%d\n", node, in_pc, in_vc, flit_ptr->flit_num);
              print_mbox(&(router_input_buf[node][in_pc][in_vc]));
              fflush(stdout);
              exit(0);
            }
            else if( is_ready[node][out_pc ][out_vc ].xbarout == NO )
            { 
              // Another head flit has already reserved the output channel. Need to wait.

              if(out_pc < NUM_PC-NUM_NIC_PC)
              {
                next_node = neighbor[node][out_pc];
                next_pc = neighbor_pc[node][out_pc];
              }
              else
              {
                next_node = node;
                next_pc = out_pc;
              }
              if(flit_ptr->priority_id == vc_info[next_node][next_pc][out_vc].priority_id)
              {
                flit_ptr->interference_cycles++;
                flit_ptr->buff_full_interference_cycles++;
              }

              if(verbose == YES)
              {
                read_flit(&(router_input_buf[node][in_pc][in_vc]), &flit_ptr);
                printf("OUTVC_XBAR_NA[%d][%d][%d] -> [%d][%d][%d], flit:%d(%d) to [%d][%d][%d] %s\n", node, in_pc, in_vc, 
                    node, out_pc, out_vc, flit_ptr->flit_num, sim_clock-flit_ptr->entry_time,next_node,next_pc, out_vc,
                    vc_state[vc_info[next_node][next_pc][out_vc].vc_stat]); // Next router input buffer is not available.
                print_mbox(&(router_input_buf[next_node][next_pc][out_vc]));
                fflush(stdout);
              }
              continue;
            }
          }
          else if( !HEAD_FLIT && path_reserved == NO )
          {
            // Non-head flit is sent to a non-reserved path. Should not happen.
            printf("Non-Head to non-reserved path. Bad.[%d][%d][%d]: flit:%d\n", node, in_pc, in_vc, flit_ptr->flit_num);
            fflush(stdout);
            exit(0);
          }
          //-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-



          //-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
          //Check buffer availability in next router
          if(out_pc < NUM_PC-NUM_NIC_PC)
          {
            next_node = neighbor[node][out_pc];
            next_pc = neighbor_pc[node][out_pc];

            if(ATOMIC_BUFFER)
            {
              //atomic buffer allocation
              //comment out this if statement for non-atomic buffer allocation

              if(HEAD_FLIT && is_ready[next_node][next_pc ][out_vc ].xbarin == NO)
              { // Another head flit has already reserved the output channel. Need to wait.
                if(verbose == YES)
                {
                  printf("SA_RT_BNA[%d][%d][%d] -> [%d][%d][%d], flit:%d(%d) to [%d][%d][%d] %s\n", node, in_pc, in_vc, 
                      next_node, next_pc, out_vc, flit_ptr->flit_num, sim_clock-flit_ptr->entry_time,next_node,next_pc, out_vc,
                      vc_state[vc_info[next_node][next_pc][out_vc].vc_stat]); // Next router input buffer is not available.
                  print_mbox(&(router_input_buf[next_node][next_pc][out_vc]));
                  fflush(stdout);
                }
                int next_priority = get_max_priority(next_node,next_pc);
                if(next_priority != -1 && flit_ptr->priority > next_priority)
                  flit_ptr->priority_inversion_cycles++;
                continue;
              }

              //atomic buffer allocation
              //comment out this if statement for non-atomic buffer allocation

              if(HEAD_FLIT && is_ready[next_node][next_pc][out_vc].rinbuf < ROUTER_INPUT_BUF_SIZE)
              { // Another head flit has already reserved the output channel. Need to wait.
                if(verbose == YES)
                {
                  printf("SA_RT_BNA[%d][%d][%d] -> [%d][%d][%d], flit:%d(%d) to [%d][%d][%d] %s\n", node, in_pc, in_vc, 
                      next_node, next_pc, out_vc, flit_ptr->flit_num, sim_clock-flit_ptr->entry_time,next_node,next_pc, out_vc,
                      vc_state[vc_info[next_node][next_pc][out_vc].vc_stat]); // Next router input buffer is not available.
                  print_mbox(&(router_input_buf[next_node][next_pc][out_vc]));
                  fflush(stdout);
                }
                int next_priority = get_max_priority(next_node,next_pc);
                if(next_priority != -1 && flit_ptr->priority > next_priority)
                  flit_ptr->priority_inversion_cycles++;
                continue;
              }
            }

            if(is_ready[next_node][next_pc][out_vc].rinbuf <= 0)
            {
              if(flit_ptr->priority_id == vc_info[next_node][next_pc][out_vc].priority_id)
              {
                flit_ptr->interference_cycles++;
                flit_ptr->buff_full_interference_cycles++;
              }
              if(verbose == YES)
              {
                printf("SA_RT_BNA[%d][%d][%d] -> [%d][%d][%d], flit:%d(%d) to [%d][%d][%d] %s\n", node, in_pc, in_vc, 
                    next_node, next_pc, out_vc, flit_ptr->flit_num, sim_clock-flit_ptr->entry_time,next_node,next_pc, out_vc,
                    vc_state[vc_info[next_node][next_pc][out_vc].vc_stat]); // Next router input buffer is not available.
                print_mbox(&(router_input_buf[next_node][next_pc][out_vc]));
                fflush(stdout);
              }
              continue;
            }// if
            if(PRIORITY_VC_SET == YES && is_ready[next_node][next_pc][out_vc].pinbuf <= 0)
            {
              if(verbose == YES)
              {
                printf("SA_RT_BNA[%d][%d][%d] -> [%d][%d][%d], flit:%d(%d) to [%d][%d][%d] %s  (pinbuf)\n", node, in_pc, in_vc, 
                    next_node, next_pc, out_vc, flit_ptr->flit_num, sim_clock-flit_ptr->entry_time,next_node,next_pc, out_vc,
                    vc_state[vc_info[next_node][next_pc][out_vc].vc_stat]); // Next router input buffer is not available.
                print_mbox(&(router_input_buf[next_node][next_pc][out_vc]));
                fflush(stdout);
              }
              continue;
            }// if

          }
          else // if(out_pc >= NUM_PC-NUM_NIC_PC)
          {
            if(is_ready[node][out_pc][out_vc].ninbuf == NO)
            {
              if(verbose == YES)
              {
                printf("SA_NI_BNA[%d][%d][%d] -> [%d][%d][%d], flit:%d(%d)\n", node, in_pc, in_vc, 
                    node, out_pc, out_vc, flit_ptr->flit_num, sim_clock-flit_ptr->entry_time); // NIC input buffer is not available.
                fflush(stdout);
              }
              continue;
            }// if
          }// else
          //-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-

          //-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
          //Setup Request Tables
          if( request_type == 1 ) // Speculative requests  
          {
            SA_stage1_spec_req[node][in_pc][in_vc] = 1;
            SA_stage1_req_table[node][in_pc][in_vc] = 1;
            need_stage1_spec_arb[node][in_pc] = 1;

            if(SA_type2 == PR)
            {
              if(slack_based_arbitration)
              {
                //Please note that some offsetting has to be done because the pri
                //array is long long and cannot contain fractions
                float age = sim_clock - flit_ptr->msg_inj_tm; 
                float slack = flit_ptr->slack;
                SA_stage1_spec_pri[node][in_pc][in_vc] = (long long)(age > slack ? age/slack : -10000*(age/slack)); 
              }
              else if(age_based_arbitration)
                SA_stage1_spec_pri[node][in_pc][in_vc] = sim_clock - flit_ptr->msg_inj_tm; 
              else
                SA_stage1_spec_pri[node][in_pc][in_vc] = 
                  flit_ptr->priority;
                
              SA_stage1_spec_age[node][in_pc][in_vc] = sim_clock - flit_ptr->msg_inj_tm; 
              if(enable_app_age)
                SA_stage1_spec_age[node][in_pc][in_vc] = sim_clock - flit_ptr->app_inj_tm; 
            }
            //=======================================================================
            // XShare support 
            //=======================================================================
            /*if(xshare == YES && flit_ptr->short_flit == YES) 
              {
              int out_pc = vc_info[node][in_pc][in_vc].out_pc; 
              SA_xshare_req[node][out_pc][in_pc*NUM_VC + in_vc] = 1;
              }*/
            //=======================================================================
            // XShare support end 
            //=======================================================================

            if(verbose == YES)
            {
              out_pc = vc_info[node][in_pc][in_vc].out_pc; 
              printf("request table spec SA 1  [%d][%d][%d] to pc [%d] flit:%d pri id:%lld pri:%d batch id:%lld last batch id:%lld @clock %lld\n",
                  node,in_pc,in_vc, out_pc,flit_ptr->flit_num,
                  flit_ptr->priority,SA_stage1_spec_pri[node][in_pc][in_vc], 
                  flit_ptr->batch_id, last_batch_id[node],(long long)sim_clock);
            }
          }
          else 
          {
            SA_stage1_req[node][in_pc][in_vc] = 1;
            need_stage1_SA_arb[node][in_pc] = 1;
            SA_stage1_req_table[node][in_pc][in_vc] = 1;
            if(SA_type2 == PR)
            {
              if(slack_based_arbitration)
              {
                //Please note that some offsetting has to be done because the pri
                //array is long long and cannot contain fractions
                float age = sim_clock - flit_ptr->msg_inj_tm; 
                float slack = flit_ptr->slack;
                SA_stage1_pri[node][in_pc][in_vc] = (long long)(age > slack ? age/slack : -10000*(age/slack)); 
              }
              else if(age_based_arbitration)
                SA_stage1_pri[node][in_pc][in_vc] = sim_clock - flit_ptr->msg_inj_tm; 
							else
                SA_stage1_pri[node][in_pc][in_vc] = 
                  flit_ptr->priority;
               
              SA_stage1_age[node][in_pc][in_vc] = sim_clock - flit_ptr->msg_inj_tm; 
              if(enable_app_age)
                SA_stage1_age[node][in_pc][in_vc] = sim_clock - flit_ptr->app_inj_tm; 
            }
            //=======================================================================
            // XShare support 
            //=======================================================================
            if(xshare == YES && flit_ptr->short_flit == YES) 
            {
              int out_pc = vc_info[node][in_pc][in_vc].out_pc; 
              SA_xshare_req[node][out_pc][in_pc*NUM_VC + in_vc] = 1;
            }
            //=======================================================================
            // XShare support end 
            //=======================================================================
            if(verbose == YES)
            {
              out_pc = vc_info[node][in_pc][in_vc].out_pc; 
              printf("request table  SA 1  [%d][%d][%d] to pc [%d] flit:%d pri id:%d pri:%lld batch id:%lld lats batch id:%lld @clock %lld\n",
                  node,in_pc,in_vc,out_pc, flit_ptr->flit_num,
                  flit_ptr->priority_id,SA_stage1_pri[node][in_pc][in_vc], 
                  flit_ptr->batch_id, last_batch_id[node], (long long)sim_clock);
            }
          }
          //-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-

        }// if
      }// for in_vc, in_pc, node
    }
  }


  // ----------------------------------------------
  // 2. Now, do the stage1 arbitration.
  //    (V:1 arbitration per input PC)
  // ----------------------------------------------
  for(node=0; node<NUM_NODES; node++)
  {
    NUM_PC     = router_info[node].num_pc;
    NUM_NIC_PC = router_info[node].num_nic_pc;
    NUM_VC     = router_info[node].num_vc;
    for(in_pc=0; in_pc<NUM_PC; in_pc++)
    {
      // 1. Non-speculative
      if(need_stage1_SA_arb[node][in_pc] == 1)
      {
        if(verbose == YES)
          printf("SA stage 1 arbitration for [%d][%d]\n",node,in_pc,in_vc);
        if(SA_type1 == RR) // Round-Robin Arbiter
          RR_arbiter (SA_stage1_req[node][in_pc], 
              SA_stage1_win[node][in_pc], 1, NUM_VC);
        else if(SA_type1 == LRU) // Matrix (LRU) Arbiter
          LRU_arbiter(SA_stage1_req[node][in_pc],
              SA_stage1_win[node][in_pc], 1,  NUM_VC);
        else if(SA_type1 == PR) // Priority RR Arbiter
          PR_arbiter(SA_stage1_req[node][in_pc],
              SA_stage1_win[node][in_pc], 
              SA_stage1_pri[node][in_pc],
              SA_stage1_age[node][in_pc], 1,  NUM_VC);
        /*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*/
        // Power
        p_sw_arb += router_info[node].pd_v_1_arb + router_info[node].pl_v_1_arb; 
        /*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*/
      }// if 

      // 2. Speculative
      if(need_stage1_spec_arb[node][in_pc] == 1)
      {
        if(verbose == YES)
          printf("SA spec stage 1 arbitration for [%d][%d]\n",node,in_pc,in_vc);
        if(SA_type1 == RR) // Round-Robin Arbiter
          RR_arbiter (SA_stage1_spec_req[node][in_pc], 
              SA_stage1_spec_win[node][in_pc], 1, NUM_VC);
        else if(SA_type1 == LRU) // Matrix (LRU) Arbiter
          LRU_arbiter(SA_stage1_spec_req[node][in_pc],
              SA_stage1_spec_win[node][in_pc], 1,  NUM_VC);
        else if(SA_type1 == PR) // Matrix (LRU) Arbiter
          PR_arbiter(SA_stage1_spec_req[node][in_pc],
              SA_stage1_spec_win[node][in_pc], 
              SA_stage1_spec_pri[node][in_pc],
              SA_stage1_spec_age[node][in_pc], 1,  NUM_VC);
        /*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*/
        // Power
        p_sw_arb += router_info[node].pd_v_1_arb + router_info[node].pl_v_1_arb; 
        /*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*/
      }// if 
    }
  }

  // ----------------------------------------------
  // 3. Generate stage2 request table
  // ----------------------------------------------
  for(node=0; node<NUM_NODES; node++)
  {
    NUM_PC     = router_info[node].num_pc;
    NUM_NIC_PC = router_info[node].num_nic_pc;
    NUM_VC     = router_info[node].num_vc;
    if(node_switch[node] == BUS_SWITCH)
    {
      need_stage2_bus_arb[node] = 0;
      need_stage2_bus_spec_arb[node] = 0;
    }

    for(out_pc=0; out_pc<NUM_PC; out_pc++)
      for(in_pc=0; in_pc<NUM_PC; in_pc++)
      {
        //Initialize request tables and arbitration tables
        SA_stage2_req[node][out_pc][in_pc]      = 0;
        SA_stage2_spec_req[node][out_pc][in_pc] = 0;
        need_stage2_SA_arb[node][out_pc]        = 0;
        need_stage2_spec_arb[node][out_pc]      = 0;

        SA_stage2_req_table[node][out_pc][in_pc]      = 0;

        if(node_switch[node] == BUS_SWITCH)
        {    
          SA_stage2_bus_req[node][out_pc*NUM_PC + in_pc] = 0;
          SA_stage2_bus_spec_req[node][out_pc*NUM_PC + in_pc] = 0;
        }
      }

    for(in_pc=0; in_pc<NUM_PC; in_pc++)
      for(in_vc=0; in_vc<NUM_VC; in_vc++)
      {
        out_pc = vc_info[node][in_pc][in_vc].out_pc; 
        // 1. Non-speculative
        if(SA_stage1_req[node][in_pc][in_vc] == 1)
        { // If a VC in a PC has a request(winner of stage 1), 
          // generate a request for stage 2.
          SA_stage2_req[node][out_pc][in_pc] = 1;
          SA_stage2_pri[node][out_pc][in_pc] = SA_stage1_pri[node][in_pc][in_vc];
          need_stage2_SA_arb[node][out_pc] = 1;
          SA_stage2_req_table[node][out_pc][in_pc]      = 1;
          SA_stage2_age[node][out_pc][in_pc] = SA_stage1_age[node][in_pc][in_vc];

          if(node_switch[node] == BUS_SWITCH)
          {
            need_stage2_bus_arb[node] = 1;
            SA_stage2_bus_req[node][out_pc*NUM_PC + in_pc] = 1;
          }
          if(verbose == YES)
            printf("request table SA 2 : [%d][%d][%d] out_pc:%d\n",node,in_pc,in_vc,out_pc);
        }// if

        // 2. Speculative
        if(SA_stage1_spec_req[node][in_pc][in_vc] == 1)
        { // If a VC in a PC has a request(winner of stage 1), 
          // generate a request for stage 2.
          SA_stage2_spec_req[node][out_pc][in_pc] = 1;
          SA_stage2_spec_pri[node][out_pc][in_pc] = SA_stage1_spec_pri[node][in_pc][in_vc];
          SA_stage2_req_table[node][out_pc][in_pc]      = 1;
          need_stage2_spec_arb[node][out_pc] = 1;
          SA_stage2_spec_age[node][out_pc][in_pc] = SA_stage1_spec_age[node][in_pc][in_vc];
          
          if(node_switch[node] == BUS_SWITCH)
          {
            need_stage2_bus_spec_arb[node] = 1;
            SA_stage2_bus_spec_req[node][out_pc*NUM_PC + in_pc] = 1;
          }

          if(verbose == YES)
            printf("request table spec SA 2 : [%d][%d][%d] out_pc:%d\n",node,in_pc,in_vc,out_pc);
        }// if
      }
  }

  // ----------------------------------------------
  // 4. Now, do the stage2 arbitration.
  //    (P:1 arbitration per output PC)
  // ----------------------------------------------
  for(node=0; node<NUM_NODES; node++)
  {
    NUM_PC     = router_info[node].num_pc;
    NUM_NIC_PC = router_info[node].num_nic_pc;
    NUM_VC     = router_info[node].num_vc;
    // A. Xbar Switch
    if(node_switch[node] == XBAR_SWITCH)
    {
      for(out_pc=0; out_pc<NUM_PC; out_pc++)
      {
        // 1. Non-speculative
        if(need_stage2_SA_arb[node][out_pc] == 1)
        {
          //if(verbose == YES)
          //printf("SA stage 2 arbitration for %d for out_pc:%d\n",node,out_pc);
          if(SA_type2 == RR) // Round-Robin Arbiter
            RR_arbiter (SA_stage2_req[node][out_pc], 
                SA_stage2_win[node][out_pc], 1, NUM_PC);
          else if(SA_type2 == LRU) // Matrix (LRU) Arbiter
            LRU_arbiter(SA_stage2_req[node][out_pc], 
                SA_stage2_win[node][out_pc], 1, NUM_PC);
          else if(SA_type2 == PR) // Priority RR Arbiter
            PR_arbiter(SA_stage2_req[node][out_pc], 
                SA_stage2_win[node][out_pc],
                SA_stage2_pri[node][out_pc],
                SA_stage2_age[node][out_pc], 1, NUM_PC);
          /*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*/
          // Power
          p_sw_arb += router_info[node].pd_p_1_arb + router_info[node].pl_p_1_arb; 
          /*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*/
        }// if

        // 2. Speculative
        if(need_stage2_spec_arb[node][out_pc] == 1)
        {
          //if(verbose == YES)
          //printf("SA stage 2 spec arbitration for %d for out_pc:%d\n",node,out_pc);
          if(SA_type2 == RR) // Round-Robin Arbiter
            RR_arbiter (SA_stage2_spec_req[node][out_pc], 
                SA_stage2_spec_win[node][out_pc], 1, NUM_PC);
          else if(SA_type2 == LRU) // Matrix (LRU) Arbiter
            LRU_arbiter(SA_stage2_spec_req[node][out_pc], 
                SA_stage2_spec_win[node][out_pc], 1, NUM_PC);
          else if(SA_type2 == PR) // Priority RR Arbiter
            PR_arbiter(SA_stage2_spec_req[node][out_pc], 
                SA_stage2_spec_win[node][out_pc], 
                SA_stage2_spec_pri[node][out_pc],
                SA_stage2_spec_age[node][out_pc], 1, NUM_PC);
          /*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*/
          // Power
          p_sw_arb += router_info[node].pd_p_1_arb + router_info[node].pl_p_1_arb; 
          /*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*/
        }// if
      }
    }
    // B. Bus Switch
    if(node_switch[node] == BUS_SWITCH)
    {
      // 1. Non-speculative
      if(need_stage2_bus_arb[node] == 1)
      {
        //if(verbose == YES)
        //printf("SA stage 2 bus arbitration for node%d\n",node);
        if(SA_type2 == RR) // Round-Robin Arbiter
          RR_arbiter (SA_stage2_bus_req[node], 
              SA_stage2_bus_win[node], NUM_PC, NUM_PC);
        // else
        //{printf("BUS only RR arb supported!\n");exit(1);}
      }
      // 2. Speculative
      if(need_stage2_bus_spec_arb[node] == 1)
      {
        //if(verbose == YES)
        //printf("SA stage 2 spec bus arbitration for node%d\n",node);
        if(SA_type2 == RR) // Round-Robin Arbiter
          RR_arbiter (SA_stage2_bus_spec_req[node], 
              SA_stage2_bus_spec_win[node], NUM_PC, NUM_PC);
        //else
        //{printf("BUS only RR arb supported!\n");exit(1);}
      }

      //3. copy arbitration results to original request tables
      bool spec_req_required = true;
      for(out_pc=0; out_pc<NUM_PC; out_pc++)
        for(in_pc=0; in_pc<NUM_PC; in_pc++)
        {
          SA_stage2_req[node][out_pc][in_pc] = SA_stage2_bus_req[node][out_pc*NUM_PC + in_pc]; 
          if(SA_stage2_req[node][out_pc][in_pc] ==1)
            spec_req_required=false;
        }

      for(out_pc=0; out_pc<NUM_PC; out_pc++)
        for(in_pc=0; in_pc<NUM_PC; in_pc++)
        {
          if(spec_req_required)
            SA_stage2_spec_req[node][out_pc][in_pc] = SA_stage2_bus_spec_req[node][out_pc*NUM_PC + in_pc]; 
          else
            SA_stage2_spec_req[node][out_pc][in_pc] = 0; 
        }
      /*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*/
      // Power
      p_sw_arb += router_info[node].pd_p_1_arb + router_info[node].pl_p_1_arb; 
      /*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*/
    }// if
  }


  // -------------------------------------------------
  // Now, arbitration done. Reserve/Release path.
  // -------------------------------------------------
  for(node=0; node<NUM_NODES; node++)
  {
    NUM_PC     = router_info[node].num_pc;
    NUM_NIC_PC = router_info[node].num_nic_pc;
    NUM_VC     = router_info[node].num_vc;
    // Initialize PC usage variables.
    for(in_pc=0; in_pc<NUM_PC; in_pc++)
    {
      is_in_vc_used [in_pc]=NO;
      is_out_vc_used[in_pc]=NO;
    }

    //=======================================================================
    // XShare support 
    //=======================================================================
    if(xshare == YES)
    {
      for(out_pc=0; out_pc<NUM_PC; out_pc++)
      {
        int baseline_pc = -1;
        bool xshare_arb = false;

        for(in_pc=0; in_pc<NUM_PC; in_pc++)
          for(in_vc=0; in_vc<NUM_VC; in_vc++)
            if(SA_stage2_req[node][out_pc][in_pc] == 1 &&
                SA_stage1_req[node][in_pc][in_vc] == 1 &&
                vc_info[node][in_pc][in_vc].vc_stat == VC_VA_DONE )
            {
              read_flit(&(router_input_buf[node][in_pc][in_vc]), &flit_ptr);
              baseline_pc = in_pc;
              if(flit_ptr->short_flit == YES)
                xshare_arb = true;
              break;
            }
        if(xshare_arb == true)
        {
          XShare_arbiter(SA_xshare_req[node][out_pc],SA_xshare_win[node][out_pc],1,NUM_PC*NUM_VC);

          for(in_pc=0; in_pc<NUM_PC; in_pc++)
          {
            bool xshare_conflict = true;
            //If no flit has already been chosen from this in pc then we are
            //safe
            if(in_pc == baseline_pc)
              xshare_conflict=false;
            else if(SA_stage2_req[node][out_pc][in_pc] != 1) 
              xshare_conflict=false;
            //If a flit has already been chosen from this in pc we must make
            //sure that the already chosen flit is a short flit, to allow the
            //new xshare flit to transfer
            /*else
              {
              for(int in_vc_old=0; in_vc_old<NUM_VC; in_vc_old++)
              if(SA_stage1_req[node][in_pc][in_vc_old] == 1)
              {
              read_flit(&(router_input_buf[node][in_pc][in_vc_old]), &flit_ptr);
              if(flit_ptr->short_flit == YES)
              {
              xshare_conflict=false;
              break;
              }
              }
              }*/
            int taken_out_vc[MAX_VC]={0};
            if(xshare_conflict == false)
              for(in_vc=0; in_vc<NUM_VC; in_vc++)
                if(SA_xshare_req[node][out_pc][in_pc*NUM_VC + in_vc] == 1)
                {
                  int out_vc = vc_info[node][in_pc][in_vc].out_vc;  
                  if(taken_out_vc[out_vc]==0)
                  {
                    SA_stage1_req[node][in_pc][in_vc] = 1;
                    taken_out_vc[out_vc]=1;
                  }
                }
          }//in pc
        }
      }//out pc
    }//xshare
    //=======================================================================
    // XShare support end 
    //=======================================================================
    // First, check non-speculative arbitration first.
    for(out_pc=0; out_pc<NUM_PC; out_pc++)
    {
      int win_id = -1;
      for(in_pc=0; in_pc<NUM_PC; in_pc++)
        if(SA_stage2_req[node][out_pc][in_pc] == 1)
          // SA_stage2_win contains winning input PCs
          // Need to find out which VC wins the first stage.
          for(in_vc=0; in_vc<NUM_VC; in_vc++)
            if( SA_stage1_req[node][in_pc][in_vc] == 1 &&
                vc_info[node][in_pc][in_vc].vc_stat == VC_VA_DONE )
            {
              out_vc = vc_info[node][in_pc][in_vc].out_vc;

              // Reserve (Release) path if the flit is Head (Tail) flit.
              recv_flit(&(router_input_buf[node][in_pc][in_vc]), &flit_ptr);

              if(verbose == YES) 
                //if(node == 62)
              { 
                printf("SA      [%d][%d][%d]-flit:%d(%s) to [%d][%d][%d] at %lld [src/rank/batch][%d/%d/%d]\n", node, 
                    in_pc, in_vc, flit_ptr->flit_num, 
                    (HEAD_FLIT)? "HEAD" :
                    (TAIL_FLIT)? "TAIL" :"MIDDLE",
                    node, out_pc, out_vc, sim_clock, 
                    flit_ptr->priority_id,flit_ptr->priority, flit_ptr->batch_id); 
                next_node = neighbor[node][out_pc];
                next_pc = neighbor_pc[node][out_pc];
                printf("rinbuf [%d][%d][%d] :%d\n",next_node,next_pc,out_vc,is_ready[next_node][next_pc][out_vc].rinbuf);
                print_mbox(&(router_input_buf[next_node][next_pc][out_vc]));
                fflush(stdout);
              }

              //=======================================================================
              //Stall Time Fairness Support
              //=======================================================================
              SA_win_id[node][out_pc] = flit_ptr->priority_id;
              SA_stage1_req_table[node][in_pc][in_vc] = 0;

              if(HEAD_FLIT)
              {
                // Reserve Path
                is_ready[node][in_pc ][in_vc ].xbarin  = NO;
                is_ready[node][out_pc][out_vc].xbarout = NO;
                
                if(out_pc >= NUM_PC - NUM_NIC_PC)
                  vc_info[node][out_pc][out_vc].priority_id = flit_ptr->priority_id;
                else if(vc_info[node][out_pc][out_vc].vc_stat == VC_IDLE)
                  vc_info[node][out_pc][out_vc].priority_id = flit_ptr->priority_id;

                if(verbose == YES)
                {
                  printf("xbarout reserving [%d][%d][%d] by flit:%d(%s)\n",node,out_pc,out_vc,flit_ptr->flit_num,"HEAD");
                  printf("xbarin reserving [%d][%d][%d] by flit:%d(%s)\n",node,in_pc,in_vc,flit_ptr->flit_num,"HEAD");
                }
                // single flit case
                if(TAIL_FLIT)
                {// Release Path
                  is_ready[node][in_pc ][in_vc ].xbarin  = YES;
                  is_ready[node][out_pc][out_vc].xbarout = YES;
                  if(verbose == YES)
                  {
                    printf("xbarin releasing [%d][%d][%d] by flit:%d(%s)\n",node,in_pc,in_vc,flit_ptr->flit_num,"TAIL");
                    printf("xbarout releasing [%d][%d][%d] by flit:%d(%s)\n",node,out_pc,out_vc,flit_ptr->flit_num,"TAIL");
                  }

                }

              }
              else if(TAIL_FLIT)
              {// Release Path
                is_ready[node][in_pc ][in_vc ].xbarin  = YES;
                is_ready[node][out_pc][out_vc].xbarout = YES;
                if(verbose == YES)
                {
                  printf("xbarin releasing [%d][%d][%d] by flit:%d(%s)\n",node,in_pc,in_vc,flit_ptr->flit_num,"TAIL");
                  printf("xbarout releasing [%d][%d][%d] by flit:%d(%s)\n",node,out_pc,out_vc,flit_ptr->flit_num,"TAIL");
                }

              }

              // Now, set the VC status to SA_ACTIVE_TMP
              vc_info[node][in_pc][in_vc].vc_stat = VC_SA_DONE_TMP;
              // for multicycle switch transfers

              if(out_pc < NUM_PC - NUM_NIC_PC)
              {
                // Set has_flit to YES. This value is used when checking is_ready[][][].rinbuf.
                has_flit[node][out_pc][out_vc].arbiter = YES;
              }

              // Now, this flit is ready to go.
              send_flit(&(xbar_buf[node][in_pc][in_vc]), &flit_ptr);

              // Mark PCs used
              // This will be used below when checking speculative SA.
              is_in_vc_used [in_pc ]=YES;
              is_out_vc_used[out_pc]=YES;
            }// if
    }

    if(arch <=3)
    {
      //=======================================================================
      // XShare support 
      //=======================================================================
      /*if(xshare == YES)
        {
        bool xshare_arb = false;
        int baseline_pc = -1;
        for(out_pc=0; out_pc<NUM_PC; out_pc++)
        {
        for(in_pc=0; in_pc<NUM_PC; in_pc++)
        for(in_vc=0; in_vc<NUM_VC; in_vc++)
        if(SA_stage2_spec_req[node][out_pc][in_pc] == 1 &&
        SA_stage1_spec_req[node][in_pc][in_vc] == 1 &&
        ( vc_info[node][in_pc][in_vc].vc_stat == VC_VA_DONE_TMP ||
        vc_info[node][in_pc][in_vc].vc_stat == VC_ROUTING) &&
        is_in_vc_used [in_pc ] == NO && is_out_vc_used[out_pc] == NO )
        {
        read_flit(&(router_input_buf[node][in_pc][in_vc]), &flit_ptr);
        baseline_pc = in_pc;
        if(flit_ptr->short_flit == YES)
        xshare_arb = true;
        break;
        }
        if(xshare_arb == true)
        {
        XShare_arbiter(SA_xshare_req[node][out_pc],SA_xshare_win[node][out_pc],1,NUM_PC*NUM_VC);

        for(in_pc=0; in_pc<NUM_PC; in_pc++)
        {
        bool xshare_conflict = true;
      //If no flit has already been chosen from this in pc then we are
      //safe
      if(in_pc == baseline_pc)
      xshare_conflict=false;
      else if(SA_stage2_spec_req[node][out_pc][in_pc] != 1) 
      xshare_conflict=false;
      //if any flit has been already be chosen we must make sure that
      //the already chosen flit is a short flit to all the new xshare
      //flit to transfer 
      else
      {
      for(int in_vc_old=0; in_vc_old<NUM_VC; in_vc_old++)
      if(SA_stage1_spec_req[node][in_pc][in_vc_old] == 1)
      {
      read_flit(&(router_input_buf[node][in_pc][in_vc_old]), &flit_ptr);
      if(flit_ptr->short_flit == YES)
      {
      xshare_conflict=false;
      break;
      }
      }
      }
      if(xshare_conflict == false)
      for(in_vc=0; in_vc<NUM_VC; in_vc++)
      if(SA_xshare_req[node][out_pc][in_pc*NUM_VC + in_vc] == 1)
      SA_stage1_spec_req[node][in_pc][in_vc] = 1;
      }//in pc
      }

      }
      }*/
      //=======================================================================
      // XShare support end 
      //=======================================================================

      // Now, check speculative arbitration to see if any of the speculative winners
      // can be chosen as final winners. (Only when both input and output PCs necessary 
      // for thespeculative winner are not assigned in the non-speculative arbitration above.)
      for(out_pc=0; out_pc<NUM_PC; out_pc++)
      {
        int win_id = -1;
        for(in_pc=0; in_pc<NUM_PC; in_pc++)
          // If there is a winner and both in_pc and out_pc for this winner is available
          // grant this speculative request.
          if(SA_stage2_spec_req[node][out_pc][in_pc] == 1 &&
              is_in_vc_used [in_pc ] == NO && is_out_vc_used[out_pc] == NO )
          {
            // SA_stage2_spec_win contains winning input PCs
            // Need to find out which VC wins the first stage.
            for(in_vc=0; in_vc<NUM_VC; in_vc++)
              if( SA_stage1_spec_req[node][in_pc][in_vc] == 1 &&
                  ( vc_info[node][in_pc][in_vc].vc_stat == VC_VA_DONE_TMP ||
                    vc_info[node][in_pc][in_vc].vc_stat == VC_ROUTING      )
                )
              {
                out_vc = vc_info[node][in_pc][in_vc].out_vc;

                // Reserve (Release) path if the flit is Head (Tail) flit.
                recv_flit(&(router_input_buf[node][in_pc][in_vc]), &flit_ptr);

                if(verbose == YES) 
                { 
                  printf("SA      [%d][%d][%d]-flit:%d(%s) to [%d][%d][%d] at %lld [src/rank/batch][%d/%d/%d]\n", node, 
                      in_pc, in_vc, flit_ptr->flit_num, 
                      (HEAD_FLIT)? "HEAD" :
                      (TAIL_FLIT)? "TAIL" :"MIDDLE",
                      node, out_pc, out_vc,sim_clock, 
                    flit_ptr->priority_id,flit_ptr->priority, flit_ptr->batch_id); 
                  next_node = neighbor[node][out_pc];
                  next_pc = neighbor_pc[node][out_pc];
                  printf("rinbuf [%d][%d][%d] :%d\n",next_node,next_pc,out_vc,is_ready[next_node][next_pc][out_vc].rinbuf);
                  print_mbox(&(router_input_buf[next_node][next_pc][out_vc]));
                  fflush(stdout);
                }
                if(HEAD_FLIT)
                {// Reserve Path
                  is_ready[node][in_pc ][in_vc ].xbarin  = NO;
                  is_ready[node][out_pc][out_vc].xbarout = NO;
                  if(verbose == YES)
                  {
                    printf("xbarout reserving [%d][%d][%d] by flit:%d(%s)\n",node,out_pc,out_vc,flit_ptr->flit_num,"HEAD");
                    printf("xbarin reserving [%d][%d][%d] by flit:%d(%s)\n",node,in_pc,in_vc,flit_ptr->flit_num,"HEAD");
                  }

                  if(TAIL_FLIT)
                  {// Release Path
                    is_ready[node][in_pc ][in_vc ].xbarin  = YES;
                    is_ready[node][out_pc][out_vc].xbarout = YES;
                    if(verbose == YES)
                    {
                      printf("xbarin releasing [%d][%d][%d] by flit:%d(%s)\n",node,in_pc,in_vc,flit_ptr->flit_num,"HEAD");
                      printf("xbarout releasing [%d][%d][%d] by flit:%d(%s)\n",node,out_pc,out_vc,flit_ptr->flit_num,"HEAD");
                    }
                  }
                }
                else if(TAIL_FLIT)
                {// Release Path
                  is_ready[node][in_pc ][in_vc ].xbarin  = YES;
                  is_ready[node][out_pc][out_vc].xbarout = YES;
                  if(verbose == YES)
                  {
                    printf("xbarin releasing [%d][%d][%d] by flit:%d(%s)\n",node,in_pc,in_vc,flit_ptr->flit_num,"TAIL");
                    printf("xbarout releasing [%d][%d][%d] by flit:%d(%s)\n",node,out_pc,out_vc,flit_ptr->flit_num,"TAIL");
                  }
                }

                // Now, set the VC status to SA_ACTIVE_TMP
                vc_info[node][in_pc][in_vc].vc_stat = VC_SA_DONE_TMP;
                // for multicycle switch transfers

                if(out_pc < NUM_PC - NUM_NIC_PC)
                {
                  // Set has_flit to YES. This value is used when checking is_ready[][][].rinbuf.
                  has_flit[node][out_pc][out_vc].arbiter = YES;
                }

                //=======================================================================
                //Stall Time Fairness Support
                //=======================================================================
                SA_win_id[node][out_pc] = flit_ptr->priority_id;
                SA_stage1_req_table[node][in_pc][in_vc] = 0;
                // Now, this flit is ready to go.
                send_flit(&(xbar_buf[node][in_pc][in_vc]), &flit_ptr);

                /*
                   We don't have to mark PCs here since speculative winners will not have
                   duplicate in_pc or out_pc.
                   */

                // Mark PCs used
                // This will be used below when checking speculative SA.
                //is_in_vc_used [in_pc ]=YES;
                //is_out_vc_used[out_pc]=YES;

              }// if

          }// if for speculative request.
      }
    }// if arch<=3
  }// for node
  //=======================================================================
  //Stall Time Fairness Support
  //=======================================================================
  for(node=0; node<NUM_NODES; node++)
  {
    NUM_PC     = router_info[node].num_pc;
    NUM_NIC_PC = router_info[node].num_nic_pc;
    NUM_VC     = router_info[node].num_vc;

    for(in_pc=0; in_pc<NUM_PC; in_pc++)
      for(in_vc=0; in_vc<NUM_VC; in_vc++)
        if(SA_stage1_req_table[node][in_pc][in_vc] == 1)
        {
          out_pc = vc_info[node][in_pc][in_vc].out_pc; 
          read_flit(&(router_input_buf[node][in_pc][in_vc]), &flit_ptr);
          if(flit_ptr->priority_id == SA_win_id[node][out_pc] && SA_win_id[node][out_pc])
          {
            flit_ptr->interference_cycles++;
            flit_ptr->sa_cycles++;
          }
        }
  }

}

void stage3()
{
  // This stage handles data transfer through the crossbar. 
  int node, in_pc, in_vc, i, j, out_pc, out_vc;
  int pc_candidate, vc_candidate, xbar_used;
  flit_t *flit_ptr;

  static int last_selected_pc[MAX_NODES]={0}, last_selected_vc[MAX_NODES][MAX_PC]={0};

  for(node=0; node<NUM_NODES; node++)
  {
    NUM_PC     = router_info[node].num_pc;
    NUM_NIC_PC = router_info[node].num_nic_pc;
    NUM_VC     = router_info[node].num_vc;

    xbar_used = NO;

    if(router[node].health_status == FAIL || router[node].health_status == TRAPPED)  
      continue;

    for(in_pc=0; in_pc< NUM_PC; in_pc++)
    {
      for(in_vc=0; in_vc< NUM_VC; in_vc++)
      {
        /*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*/
        //Error checking
        if(verbose == YES) 
        {
          out_pc = vc_info[node][in_pc][in_vc].out_pc;
          out_vc = vc_info[node][in_pc][in_vc].out_vc;

          if(msg_cnt(&(xbar_buf[node][in_pc][in_vc])) > 1)
          {
            read_flit(&(xbar_buf[node][in_pc][in_vc]), &flit_ptr);
            printf("Xbar  Overflow [%d][%d][%d]-flit:%d(%s) to %s\n", 
                node, out_pc, out_vc, flit_ptr->flit_num, 
                (HEAD_FLIT)? "HEAD" :
                (TAIL_FLIT)? "TAIL" : "MIDDLE", 
                (out_pc >= (NUM_PC-NUM_NIC_PC))? "Eject":"Next router"); 

            printf("[%d][%d][%d] rinbuf :%d vc_stat:%s\n",node,in_pc,in_vc,msg_cnt(&(router_input_buf[node][in_pc][in_vc])),
                vc_state[vc_info[node][in_pc][in_vc].vc_stat]);
            print_mbox(&(router_input_buf[node][in_pc][in_vc]));
            fflush(stdout);
          }

          if(vc_info[node][in_pc][in_vc].vc_stat == VC_SA_DONE_TMP1 
              && msg_cnt(&(xbar_buf[node][in_pc][in_vc])) > 0 )
          {
            read_flit(&(xbar_buf[node][in_pc][in_vc]), &flit_ptr);
            printf("Xbar  Waiting [%d][%d][%d]-flit:%d(%s) to %s\n", 
                node, out_pc, out_vc, flit_ptr->flit_num, 
                (HEAD_FLIT)? "HEAD" :
                (TAIL_FLIT)? "TAIL" : "MIDDLE", 
                (out_pc >= (NUM_PC-NUM_NIC_PC))? "Eject":"Next router"); 
            fflush(stdout);
          }
        }
        /*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*/



        if( vc_info[node][in_pc][in_vc].vc_stat == VC_SA_DONE &&
            msg_cnt(&(xbar_buf[node][in_pc][in_vc])) > 0)
        {
          // Do the data transfer through the crossbar.
          recv_flit(&(xbar_buf[node][in_pc][in_vc]), &flit_ptr);

          // If this is the tail flit, set VC status to idle.
          if(TAIL_FLIT)
            vc_info[node][in_pc][in_vc].vc_stat = VC_IDLE_TMP; // idle 
          else
            vc_info[node][in_pc][in_vc].vc_stat = VC_VA_DONE;   // VA done. 

          out_pc = vc_info[node][in_pc][in_vc].out_pc;
          out_vc = vc_info[node][in_pc][in_vc].out_vc;

          if(sql == YES)
            fprintf(fsql, "INSERT INTO FLOW VALUES(0, %d,%d,%d,%d,%d,%d);\n", 
                flit_ptr->flit_num, node, out_pc, out_vc, 3, sim_clock);
          if(verbose == YES) 
          { 
            printf("Xbar    [%d][%d][%d]-flit:%d(%s) to %s\n", 
                node, out_pc, out_vc, flit_ptr->flit_num, 
                (HEAD_FLIT)? "HEAD" :
                (TAIL_FLIT)? "TAIL" : "MIDDLE", 
                (out_pc >= (NUM_PC-NUM_NIC_PC))? "Eject":"Next router"); 
            int next_node = neighbor[node][out_pc];
            int next_pc = neighbor_pc[node][out_pc];
            print_mbox(&(router_input_buf[next_node][next_pc][out_vc]));
            fflush(stdout);
          } 

          vc_utilization[node][in_pc][in_vc]++;


          // Calculate network delay at this stage. If the difference between sim_clock 
          // and the entry_time (sim_clock - entry_time -1) is the same as # pipeline stage (=arch),
          // it means that this flit didn't experience any blocking.
          // If it is greater than 1, this difference indicates the blocked cycle.
          if(sim_clock - flit_ptr->entry_time - arch > 0)
            flit_ptr->delay_cycle += sim_clock - flit_ptr->entry_time - arch;

          //Two Cycle Bus Transfer should be accounted in network transfer not
          //in blocking delay
          if(node_switch[node] == BUS_SWITCH)
          {
            //flit_ptr->delay_cycle -= 1; 
            bus_delay_cycle += sim_clock - flit_ptr->entry_time - arch - 1;
            flit_ptr->bus_latency += sim_clock - flit_ptr->entry_time;
          }

          if(verbose == YES) 
          { 
            printf("Blocking Delay   [%d][%d][%d]-flit:%d(%s) accumulated:%d current:%d interference:%d arb:%d buff-full:%d\n", 
                node, out_pc, out_vc, flit_ptr->flit_num, 
                (HEAD_FLIT)? "HEAD" :
                (TAIL_FLIT)? "TAIL" : "MIDDLE",
                flit_ptr->delay_cycle,
                sim_clock - flit_ptr->entry_time - arch, flit_ptr->interference_cycles,
                flit_ptr->sa_cycles, flit_ptr->buff_full_interference_cycles);
            fflush(stdout);
          } 


          // increase HBH buffer power since a flit is written to HBH retransmission buffer 
          // when it leaves current router.
          if( retrans_type == HBH || 
              (retrans_type == HFEC && HEAD_FLIT) || 
              (retrans_type == HE2E && HEAD_FLIT)  ) 
            p_retrans += PD_HBHBUF; 

          // Keep track of the retransmission buffer usage.
          // Since a flit is sent to the HBH retransmission buffer as soon as
          // it is sent through the crossbar, we increase it here.
          if( (sim_clock > warmup_cycle) && 
              (retrans_type == HBH || 
               (retrans_type == HFEC && HEAD_FLIT) || 
               (retrans_type == HE2E && HEAD_FLIT)  )  )
            total_retrans_buf_usage[node]++;

          /*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*/
          // Power
          if(node_switch[node] == XBAR_SWITCH)
            p_xbar += PD_XBAR_F;
          //End Power
          /*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*/

          // Send flit data.
          if(out_pc >= NUM_PC - NUM_NIC_PC)
          {
            // This flit should be ejected at this node.
            send_flit(&(nic_input_buf[node][out_pc-(NUM_PC-NUM_NIC_PC)][out_vc]), &flit_ptr);
            /*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*/
            // Power
            if(node_switch[node] == BUS_SWITCH)
              p_link += PD_LINK_BUS;
            //End Power
            /*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*/
          }
          else
          {
            // Set has_flit to YES. This value is used when checking is_ready[][][].rinbuf.
            has_flit[node][out_pc][out_vc].xbar = YES;
            p_link += PL_LINK;

            flit_ptr->num_hop++;

            // This flit should be forwarded to another router. 
            send_flit(&(link_buf[node][out_pc][out_vc]), &flit_ptr);
            /*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*/
            // Hybrid Topology Support if going from local to global or global
            // to local do link stage in same cycle
            {
              int next_node = neighbor[node][out_pc];
              if(router_info[node].type == LOCAL || router_info[next_node].type == LOCAL)
                update_cnt(&(link_buf[node][out_pc][out_vc]));
            }
            /*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*/
          }// else

          // Keep track of the activity.
          if(sim_clock > warmup_cycle)
            total_activity[node]++;

        }// if
      }// for in_vc
    }// for in_pc
  }// for node
}

//=======================================================================
// Various Arbitration Schemes. 
//=======================================================================
/*
 * Returns the index (within `set`) of the last entry of `last_win` that is
 * marked 1, or 0 when the set holds no mark.
 */
static int rr_last_grant_idx(const int *last_win, int set, int set_size)
{
  int j, idx = 0;

  for(j=0; j<set_size; j++)
    if(last_win[set*set_size + j] == 1)
      idx = j;

  return idx;
}

/*
 * RR_arbiter - round-robin arbitration over a flat request vector.
 *
 * req      : n_set*set_size request flags (1 = requesting). On success every
 *            entry except the winner is cleared to 0.
 * last_win : n_set*set_size flags marking the previous winner (1 = last
 *            winner). On success the previous mark is cleared and the new
 *            winner is marked.
 * n_set    : number of sets.
 * set_size : number of entries per set.
 *
 * Sets are visited starting with the one after the last winning set; inside
 * a set, entries are visited starting with the one after that set's last
 * winner (or after index 0 when the set has none).
 *
 * Returns 1 when a winner was found, -1 when there is no request (in which
 * case neither array is modified).
 */
int RR_arbiter(int *req, int *last_win, int n_set, int set_size)
{
  int i, j, k, new_set, new_idx, start_idx, win_pos;
  int last_win_set = 0; // Recent Winner Set
  int last_win_pos = 0; // Index inside the Recent Winner Set

  // First of all, locate the last winner (the last mark found wins).
  for(i=0; i<n_set; i++)
    for(j=0; j<set_size; j++)
      if(last_win[i*set_size + j] == 1)
      {
        last_win_set = i; 
        last_win_pos = j; 
      }

  // Now, arbitrate the requests.
  for(i=0; i<n_set; i++)
  {
    new_set = (i + 1 + last_win_set) % n_set;

    // The rotation offset must belong to the set being scanned (new_set),
    // not to the loop offset i; otherwise the previous winner of a set can
    // be granted again ahead of the next requester.
    start_idx = rr_last_grant_idx(last_win, new_set, set_size);

    for(j=0; j<set_size; j++)
    {
      new_idx = (j + 1 + start_idx) % set_size;
      win_pos = new_set*set_size + new_idx;
      if(req[win_pos] == 1)
      { // Found a new winner.

        // Now, update the last_winner information. 
        last_win[last_win_set*set_size + last_win_pos] = 0;
        last_win[win_pos] = 1;

        // Remove all the requests that lose this arbitration.
        for(k=0; k<n_set*set_size; k++)
          if(k != win_pos)
            req[k] = 0;

        return 1;
      }// if
    }// for j
  }// for i

  // There are no requests. 
  // Nothing changes.
  return -1;
}

//=======================================================================
// XShare support 
//=======================================================================
/*
 * XShare_arbiter - grant up to xshare_allowed_xfers requests at once.
 *
 * req      : n_set*set_size request flags (1 = requesting). When at least
 *            one winner is found, every non-winning entry is cleared to 0.
 * last_win : n_set*set_size flags marking the previous winner; only read,
 *            never written by this function.
 * n_set    : number of sets.
 * set_size : number of entries per set.
 *
 * Requests are scanned in the same rotated order as RR_arbiter (set after
 * the last winning set first; inside a set, the entry after that set's last
 * winner first) and the first xshare_allowed_xfers of them are kept.
 *
 * n_set*set_size must not exceed MAX_PC*MAX_VC, the size of the local
 * scratch arrays.
 *
 * Returns 1 when at least one request was granted, -1 otherwise (req is
 * left untouched in that case).
 */
int XShare_arbiter(int *req, int *last_win, int n_set, int set_size)
{
  int i, j, k, new_set, new_idx;
  int last_win_set = 0; // Recent Winner Set
  int last_win_idx[MAX_PC*MAX_VC]={0}; // Per-set index of the last winner

  int win_set_idx[MAX_PC*MAX_VC]={0}; // 1 for every flat index granted in this call
  int winners=0;

  // First of all, check last_winner.
  for(i=0; i<n_set; i++)
    for(j=0; j<set_size; j++)
      if(last_win[i*set_size + j] == 1)
      {
        last_win_set    = i; 
        last_win_idx[i] = j; 
      }

  // Collect up to xshare_allowed_xfers requests in rotated order.
  for(i=0; i<n_set; i++)
  {
    new_set = (i + 1 + last_win_set) % n_set;
    for(j=0; j<set_size; j++)
    {
      // The rotation offset is the one recorded for the set being scanned
      // (new_set), not for the loop offset i.
      new_idx = (j + 1 + last_win_idx[new_set]) % set_size;
      if( req[new_set*set_size + new_idx] == 1 && 
          winners < xshare_allowed_xfers)
      { 
        // Found a new Xshare winner.
        win_set_idx[new_set*set_size + new_idx] = 1;
        winners++;
        // This only leaves the inner loop; the quota test above keeps the
        // remaining sets from adding further winners.
        if(winners == xshare_allowed_xfers)
          break;
      }// if
    }// for j
  }// for i

  // Remove all the requests that lose this arbitration.
  if(winners > 0)
  {
    for(k=0; k<n_set*set_size; k++)
      if(win_set_idx[k] == 0)
        req[k] = 0;
    return 1;
  }

  // There are no requests. 
  // Nothing changes.
  return -1;
}
//=======================================================================
// XShare support end 
//=======================================================================


/*
 * PR_arbiter - priority arbitration with round-robin / oldest-first tie
 * breaking.
 *
 * req      : n_set*set_size request flags (1 = requesting). On success every
 *            entry except the winner is cleared to 0.
 * last_win : last-winner flags, one n_set*set_size region per priority
 *            level; the region used is (highest requested priority modulo
 *            num_priority_levels).
 * priority : priority of each request (higher value wins).
 * age      : age of each request; a request only wins if its age is greater
 *            than the best age seen so far, which starts at 0.
 * n_set    : number of sets.
 * set_size : number of entries per set.
 *
 * Among the requests at the highest priority, the scan starts after the last
 * winner recorded for that priority level. When enable_age_PR is true the
 * oldest such request wins; otherwise the first eligible one in scan order
 * wins.
 *
 * Returns 1 when a winner was found and -1 otherwise. Terminates the program
 * if a request carries a priority below -99999999.
 */
int PR_arbiter(int *req, int *last_win, long long *priority, long long *age, int n_set, int set_size)
{
  int i, j, k, new_set, new_idx, max_idx, max_set;
  long long curr_idx_priority=-1, max_idx_priority=-99999999;
  long long pri_level;  // last_win region selected by the highest priority
  int last_win_set = 0; // Recent Winner Set
  int last_win_idx[MAX_PC*MAX_VC*MAX_PRI_LEVELS]={0}; // Per-set index of the last winner
  int reqs=0;           // request count, only maintained in verbose mode
  int any_req=0;        // set as soon as one request is seen
  int done=0;           // set when the first-match winner has been taken
  int pri_set_size = n_set * set_size;
  long long max_age = 0;

  // Find the maximum priority level among the requests.
  for(i=0; i<n_set; i++)
    for(j=0; j<set_size; j++)
    {
      if(req[i*set_size + j] == 1)
        any_req = 1;

      if(verbose == YES)
        if(req[i*set_size + j] == 1)
          reqs++;

      curr_idx_priority = priority[i*set_size + j];
      if(req[i*set_size + j] == 1 && curr_idx_priority < -99999999)
      {
        printf(" curr_idx_priority error %lld\n",curr_idx_priority);
        exit(1);
      }

      // update maximum priority in the request vector
      if(req[i*set_size + j] == 1 && curr_idx_priority > max_idx_priority)
        max_idx_priority = curr_idx_priority;

    }

  // Without any request max_idx_priority is still the negative sentinel and
  // must not be turned into a last_win index. Nothing can win anyway.
  if(any_req == 0)
    return -1;

  // The remainder of a negative value is negative in C; fold it back into
  // [0, num_priority_levels) so the last_win index stays inside the table.
  pri_level = max_idx_priority % num_priority_levels;
  if(pri_level < 0)
    pri_level += num_priority_levels;

  // Locate the last winner recorded for this priority level.
  for(i=0; i<n_set; i++)
    for(j=0; j<set_size; j++)
    {
      if(last_win[pri_level*pri_set_size + i*set_size + j] == 1)
      {
        last_win_set    = i; 
        last_win_idx[i] = j; 
      }

    }

  if(verbose == YES)
    if(reqs > 1)
      printf("multiple requests in request table %d max_idx_priority:%lld set size:%d nsets:%d\n",reqs,max_idx_priority,set_size,n_set);

  max_idx = -1;
  max_set = -1;
  max_age =  0;

  // Now choose in a round robin fashion from all the requests which have
  // maximum priority (multiple requests may have same priority level)
  for(i=0; i<n_set && !done; i++)
  {
    new_set = (i + 1 + last_win_set) % n_set;
    for(j=0; j<set_size; j++)
    {
      // Rotate by the offset recorded for the set being scanned (new_set).
      new_idx = (j + 1 + last_win_idx[new_set]) % set_size;
      curr_idx_priority = priority[new_set*set_size + new_idx];
      if(    req[new_set*set_size + new_idx] == 1        //valid request
          && curr_idx_priority == max_idx_priority       //belongs to highest priority level
          && age[new_set*set_size + new_idx] > max_age) //oldest among the highest priority request
      {
        // Found a new winner.
        max_idx = new_idx;
        max_set = new_set;
        max_age = age[new_set*set_size + new_idx]; 
        // If oldest-first within a priority class is disabled, keep the
        // first request in the highest priority class after the last winner
        // of that class: stop scanning this set and all remaining sets.
        if(enable_age_PR == false)
        {
          done = 1;
          break;
        }

      }
    }
  }

  if(max_idx != -1)
  {
    // Now, update the last_winner information. 
    last_win[pri_level*pri_set_size + last_win_set*set_size + last_win_idx[last_win_set]] = 0;
    last_win[pri_level*pri_set_size + max_set*set_size + max_idx] = 1;

    if(verbose == YES)
      printf("max idx : %d max pri : %lld max age : %lld\n", max_idx, max_idx_priority,max_age);

    // Remove all the requests that lose this arbitration. The winner lives
    // in max_set, which is not necessarily the last set that was scanned.
    for(k=0; k<n_set*set_size; k++)
      if(k != max_set*set_size + max_idx)
        req[k] = 0;
    return 1;
  }

  // No request was eligible (for instance, none had an age above 0).
  // Nothing changes.
  if(reqs > 1)
    printf("ERR No winner found in request table %d\n",reqs);
  return -1;
}

/*
 * LRU_arbiter - placeholder for a least-recently-used arbiter.
 *
 * Not implemented: no request is ever granted and neither `req` nor `win`
 * is modified. Returns -1, the value the other arbiters in this file use
 * for "no winner", so that the function has a defined return value.
 */
int LRU_arbiter(int *req, int *win, int num_set, int set_size)
{
  (void)req;
  (void)win;
  (void)num_set;
  (void)set_size;
  return -1;
}

/*
 * init_array2 - zero the leading d1 x d2 corner of a 2-D int array.
 *
 * dim1, dim2 : declared extents of the array (dim1 is not used; dim2 is the
 *              row stride).
 * d1, d2     : number of rows / columns to clear.
 * arr        : pointer to the first element of the array.
 */
void init_array2(int dim1, int dim2, int d1, int d2, int *arr)
{
  for(int row = 0; row < d1; row++)
  {
    for(int col = 0; col < d2; col++)
    {
      const int offset = row*dim2 + col;
      arr[offset] = 0;
    }
  }
}

/*
 * init_array3 - zero the leading d1 x d2 x d3 corner of a 3-D int array.
 *
 * dim1, dim2, dim3 : declared extents of the array (dim1 is not used).
 * d1, d2, d3       : number of elements to clear along each dimension.
 * arr              : pointer to the first element of the array.
 */
void init_array3(int dim1, int dim2, int dim3, int d1, int d2, int d3, int *arr)
{
  for(int plane = 0; plane < d1; plane++)
  {
    for(int row = 0; row < d2; row++)
    {
      for(int col = 0; col < d3; col++)
      {
        const int offset = plane*dim2*dim3 + row*dim3 + col;
        arr[offset] = 0;
      }
    }
  }
}

/*
 * init_array4 - zero the leading d1 x d2 x d3 x d4 corner of a 4-D int array.
 *
 * dim1..dim4 : declared extents of the array (dim1 is not used).
 * d1..d4     : number of elements to clear along each dimension.
 * arr        : pointer to the first element of the array.
 */
void init_array4(int dim1, int dim2, int dim3, int dim4, int d1, int d2, int d3, int d4, int *arr)
{
  for(int a = 0; a < d1; a++)
  {
    for(int b = 0; b < d2; b++)
    {
      for(int c = 0; c < d3; c++)
      {
        for(int d = 0; d < d4; d++)
        {
          const int offset = a*dim2*dim3*dim4 + b*dim3*dim4 + c*dim4 + d;
          arr[offset] = 0;
        }
      }
    }
  }
}

/*
 * print_array3 - dump a dim1 x dim2 x dim3 int array to stdout.
 *
 * Each innermost run of dim3 values is printed on its own line, every value
 * followed by a single space.
 */
void print_array3(int dim1, int dim2, int dim3, int *arr)
{
  int plane, row, col;

  for(plane = 0; plane < dim1; plane++)
  {
    for(row = 0; row < dim2; row++)
    {
      const int base = plane*dim2*dim3 + row*dim3;
      for(col = 0; col < dim3; col++)
        printf("%d ", arr[base + col]);
      printf("\n");
    }
  }
}
